/*******************************************************************************

Name: createMatchedPanel.do

Authors: Jonathan Baker, Lori Bennear, Sheila Olmstead

Input file: Primary panel: natlCWS_yr_Panel_mod.dta

Note: this code is not necessary to replicate the results, but we include it
	  nevertheless to show in detail our matching logic.

*******************************************************************************/


* ========================== START START START ==========================
clear
clear matrix
set more off, perm

* ______________________________________________________________________________
* set paths and create log file
global root "C:/Users/jbaker/Documents/Research/SDWIS/Analysis/STATA"
global input "$root/Input"
global intermediate "$root/Intermediate"
global log "$root/LogFiles"
global output "$root/Output"
global outpanel "$root/OutputPanel"
global figures "$root/Figures"

log using "$log/createMatchedPanel.log", replace name(matchpanel)

* ============================================================================== 

* Step 0: preliminaries
*	- define panel variables
*	- define violations per capita and a few indicator variables
*	- reduce size of panel

* ==============================================================================
use "$outpanel/natlCWS_yr_Panel_mod.dta", clear
xtset pwsidNUM year

gen vpc = (HLTH_999 / pop) * 1000 // health violations per 1000 people
la var vpc "health violations per 1000 people"
egen v_avg = mean(HLTH_999) if id2001==1, by(pwsidNUM)
egen vpc_avg = mean(vpc) if id2001==1, by(pwsidNUM)
egen t0_v_avg = mean(HLTH_999) if id2001==1 & year < 1998, by(pwsidNUM)
egen t0_vpc_avg = mean(vpc) if id2001==1 & year < 1998, by(pwsidNUM)

la var v_avg "average violations"
la var vpc_avg "average violations per 1000 customers"
la var t0_v_avg "pre-treatment average violations"
la var t0_vpc_avg "pre-treatment average violations per 1000 customers" 

gen wholesale = 0
replace wholesale = 1 if is_wholesaler == "Y"
gen school = 0
replace school = 1 if is_school == "Y"
la var wholesale "wholesaler"
la var school "school or daycare"

drop is_* epa_region // same as epa_reg

* save some space
foreach i of numlist 110 121 122 123 130 140 210 220 230 310 320 331 332 333 340 350 410 420 430 500 {
	drop MCL_`i' MRDL_`i' TT_`i' MR_`i' OTH_`i' ALL_`i' HLTH_`i'
	drop I_MCL_`i' I_ALL_`i' I_HLTH_`i'
	}

* create restricted mailing and publishing treatment variables
gen T_501_restricted = T_501

gen T_10k_restricted = T_10k
replace T_10k_restricted = . if population_served_count < 501

gen T_100k_restricted = T_100k
replace T_100k_restricted = . if population_served_count < 10000

sort pwsidNUM year
compress
la da "Natl CWS annual panel without individual rules - restricted mail/web indicators"
save "$outpanel/natlCWS_yr_Panel_mod-small_v2.dta", replace

* ============================================================================== 
* Step 1: create summary table of unmatched panel
* ==============================================================================
use "$outpanel/natlCWS_yr_Panel_mod-small_v2.dta", clear

// Table of system characteristics without matching
keep if id2001==1 & year == 1990
local rows epa_reg surface public purchaser wholesale school t0_v_avg // west vpc_avg 
local rowmax = 8
mat T = J(`rowmax',11,.)

* populate matrix 
local i = 0
foreach v of varlist `rows' {
	local i = `i' + 1
	* T_501_restricted
		quietly summarize `v'
		mat T[`i',1] = r(mean)
		quietly ttest `v', by(T_501_restricted)
		display "`v' (publishing threshold): p value of difference `r(p)'"
		mat T[`i',2] = r(mu_1)
		mat T[`i',3] = r(mu_2)
		mat T[`i',4] = r(p)
		mat T[`rowmax',1] = `r(N_1)' + `r(N_2)'
		mat T[`rowmax',2] = `r(N_1)'
		mat T[`rowmax',3] = `r(N_2)'
	* T_10k_restricted
		quietly ttest `v', by(T_10k_restricted)
		display "`v' (mailing threshold): p value of difference `r(p)'"
		mat T[`i',5] = r(mu_1)
		mat T[`i',6] = r(mu_2)
		mat T[`i',7] = r(p)
		mat T[`rowmax',5] = `r(N_1)'
		mat T[`rowmax',6] = `r(N_2)'
	* T_100k_restricted
		quietly ttest `v', by(T_100k_restricted)
		display "`v' (online threshold): p value of difference `r(p)'"
		mat T[`i',8] = r(mu_1)
		mat T[`i',9] = r(mu_2)
		mat T[`i',10] = r(p)
		mat T[`rowmax',8] = `r(N_1)'
		mat T[`rowmax',9] = `r(N_2)'
}
	* output matrix to table
	local rnames "EPA region" "Surface Water" "Public" "Purchaser" "Wholesaler" "School or Daycare" "Mean Pre-Treat Viol" "Observations" // "West" "Mean Viol/1000"
	local tbloption1 "tex fragment replace coljust(rl) hlines(101{0}11) spacebef(111{0}1)"
	local tbloption2 "sdec(2\2\2\2\2\2\3\0) vlines(00000100100) multicol(1,3,3;1,6,3;1,9,3) " 
	mat rownames T = "`rnames'"
	frmttable using "$output/utilityCharacteristics_v2.tex", statmat(T) `tbloption1' `tbloption2' ///
	ctitle("{\bf id2001}","","\emph{Publishing}","","","\emph{Mailing}","","","\emph{Online}","","" \ ///
	"","Full","\(T=0\)","\(T=1\)","p-val","\(T=0\)","\(T=1\)","p-val","\(T=0\)","\(T=1\)","p-val") 

* ============================================================================== 
* Step 2: Create matches for publishing, mailing, and online threshold
* ==============================================================================
gen dummy_ovar = runiform() // generate a dummy outcome variable for teffects nnmatch
sort pwsidNUM
save "$intermediate/temp-prematching.dta", replace

** Publishing Threshold --------------------------------------------------------
	use "$intermediate/temp-prematching.dta", clear
	drop if T_501_restricted == .
	gen obs_id = _n
	teffects nnmatch (dummy_ovar epa_reg surface public purchaser t0_v_avg) (T_501), metric(mahalanobis) ///
		nn(1) vce(iid) gen(matchid_501_) osample(unmatched_501) ematch(epa_reg surface public purchaser)
		display "*** teffects just finished, threshold is T_501, and maximum matches = `e(k_nnmax)' ***"
		
		gen max = 1
		forvalues i=2/`e(k_nnmax)' { // extract the maximum number of matches for each observation
			quietly replace max = `i' if matchid_501_`i' !=.
			} 
		gen temp_id = round(runiform(1-0.5,max+0.5)) // randomly select a number from 1 to the maximum number of matches for each obs.
			// subtracting and adding 0.5 ensures that 1 and 'max' have equal chance of being selected as 2 through 'max-1'
		gen rmatch_id = .
		forvalues i = 1/`e(k_nnmax)' { // extract a random match from the list of possible matches
			quietly replace rmatch_id = matchid_501_`i' if temp_id == `i'
			}
		drop matchid_501_*
		
		preserve // save the pwsid's that match with the treatment group
		keep if T_501 == 1
		keep rmatch_id
		rename rmatch_id obs_id
		duplicates tag obs_id, gen(freq_501)
		replace freq_501 = freq_501+1
		duplicates drop obs_id, force
		sort obs_id
		save "$intermediate/temp_match_501.dta", replace
		restore
		
		preserve // save the control group pwsid's that match with the treatment group 
		keep if T_501 == 0
		keep obs_id pwsidNUM
		rename (obs_id pwsidNUM) (rmatch_id pwsid_match_501)
		duplicates report, rmatch_id
		sort rmatch_id
		save "$intermediate/temp_match2_501.dta", replace
		restore
		
		merge 1:1 obs_id using "$intermediate/temp_match_501.dta" // merge matched control observation id's back into panel
			drop if _merge == 1 & T_501 == 0 // drop "control" systems that were not matched to a treatment system
			drop if _merge == 2 			 // treatment systems that are not matched show up as "." in the temp file
			rename _merge merge1
		
		merge m:1 rmatch_id using "$intermediate/temp_match2_501.dta" // attach the control pwsid to its treated match
			drop if _merge == 2 // drop the "control" systems that were not matched to a treatment (note: = to first drop after first merge above)
			rename _merge merge2
		
		tab merge1 merge2
			// if obs is master only in merge1, it is a treated obs that SHOULD be matched to its control pwsid in merge2
			// if obs is matched in merge1, its a control obs that should NOT be matched in merge2
		
		tab T_501 rmatch_id if merge1==merge2, m // these treated systems do not have a matched control system
			list pwsidNUM epa_reg public purchaser surface t0_v_avg rmatch_id T_501 if merge1==merge2
			drop if T_501 == 1 & rmatch_id == . // these observations do not have a matched control system

		*sort and save final dataset of pwsid's and matched pwsid's (along with frequency of control pwsid's)
			tab freq T_501, m
				// frequency is missing only for treated units
				replace freq_501 = 1 if freq_501==.
			keep pwsidNUM pwsid_match_501 freq_501
			sort pwsidNUM
			la data "Publishing: matched treated obs with matched controls (with cntrl freq)"
			save "$outpanel/matchid-T_501.dta", replace
			
** Mailing Threshold -----------------------------------------------------------
	use "$intermediate/temp-prematching.dta", clear
	drop if T_10k_restricted == .
	gen obs_id = _n
	capture teffects nnmatch (dummy_ovar epa_reg surface public purchaser t0_v_avg) (T_10k), metric(mahalanobis) ///
		nn(1) vce(iid) gen(matchid_10k_) osample(unmatched_10k) ematch(epa_reg surface public purchaser)
		tab T_10k unmatched_10k, m
			// all unmatched observations are "control" observations - I can delete these
		list pwsid epa_reg surface public purchaser if unmatched_10k == .
			// the missing observations had missing data in the match variables
			drop if unmatched_10k == 1 | unmatched_10k == .
			drop unmatched_10k obs_id
	
	gen obs_id = _n
	teffects nnmatch (dummy_ovar epa_reg surface public purchaser t0_v_avg) (T_10k), metric(mahalanobis) ///
		nn(1) vce(iid) gen(matchid_10k_) osample(unmatched_10k) ematch(epa_reg surface public purchaser)
		display "*** teffects just finished, threshold is T_10k, and maximum matches = `e(k_nnmax)' ***"
		
		gen max = 1
		forvalues i=2/`e(k_nnmax)' { // extract the maximum number of matches for each observation
			quietly replace max = `i' if matchid_10k_`i' !=.
			} 
		gen temp_id = round(runiform(1-0.5,max+0.5)) // randomly select a number from 1 to the maximum number of matches for each obs.
			// subtracting and adding 0.5 ensures that 1 and 'max' have equal chance of being selected as 2 through 'max-1'
		gen rmatch_id = .
		forvalues i = 1/`e(k_nnmax)' { // extract a random match from the list of possible matches
			quietly replace rmatch_id = matchid_10k_`i' if temp_id == `i'
			}
		drop matchid_10k_*
		
		preserve // save the pwsid's that match with the treatment group
		keep if T_10k == 1
		keep rmatch_id
		rename rmatch_id obs_id
		duplicates tag obs_id, gen(freq_10k)
		replace freq_10k = freq_10k+1
		duplicates drop obs_id, force
		sort obs_id
		save "$intermediate/temp_match_10k.dta", replace
		restore
		
		preserve // save the control group pwsid's that match with the treatment group 
		keep if T_10k == 0
		keep obs_id pwsidNUM
		rename (obs_id pwsidNUM) (rmatch_id pwsid_match_10k)
		duplicates report, rmatch_id
		sort rmatch_id
		save "$intermediate/temp_match2_10k.dta", replace
		restore
		
		merge 1:1 obs_id using "$intermediate/temp_match_10k.dta" // merge matched control observation id's back into panel
			drop if _merge == 1 & T_10k == 0 // drop "control" systems that were not matched to a treatment system
			drop if _merge == 2 			 // treatment systems that are not matched show up as "." in the temp file
			rename _merge merge1
		
		merge m:1 rmatch_id using "$intermediate/temp_match2_10k.dta" // attach the control pwsid to its treated match
			drop if _merge == 2 // drop the "control" systems that were not matched to a treatment (note: = to first drop after first merge above)
			rename _merge merge2
		
		tab merge1 merge2
			// if obs is master only in merge1, it is a treated obs that SHOULD be matched to its control pwsid in merge2
			// if obs is matched in merge1, its a control obs that should NOT be matched in merge2
		
		tab T_10k rmatch_id if merge1==merge2, m // these treated systems do not have a matched control system
			list pwsidNUM epa_reg public purchaser surface t0_v_avg rmatch_id T_10k if merge1==merge2
			drop if T_10k == 1 & rmatch_id == . // these observations do not have a matched control system

		*sort and save final dataset of pwsid's and matched pwsid's (along with frequency of control pwsid's)
			tab freq T_10k, m
				// frequency is missing only for treated units
				replace freq_10k = 1 if freq_10k==.
			keep pwsidNUM pwsid_match_10k freq_10k
			sort pwsidNUM
			la data "Mailing: matched treated obs with matched controls (with cntrl freq)"
			save "$outpanel/matchid-T_10k.dta", replace
			
** Online Threshold ------------------------------------------------------------
	use "$intermediate/temp-prematching.dta", clear
	drop if T_100k_restricted == .
	gen obs_id = _n
	capture teffects nnmatch (dummy_ovar epa_reg surface public purchaser t0_v_avg) (T_100k), metric(mahalanobis) ///
		nn(1) vce(iid) gen(matchid_100k_) osample(unmatched_100k) ematch(epa_reg surface public purchaser)
		tab T_100k unmatched_100k, m
			// all unmatched observations are "control" observations - I can delete these
			drop if unmatched_100k == 1
			drop unmatched_100k obs_id
			
	gen obs_id = _n
	teffects nnmatch (dummy_ovar epa_reg surface public purchaser t0_v_avg) (T_100k), metric(mahalanobis) ///
		nn(1) vce(iid) gen(matchid_100k_) osample(unmatched_100k) ematch(epa_reg surface public purchaser)
		display "*** teffects just finished, threshold is T_100k, and maximum matches = `e(k_nnmax)' ***"

		gen max = 1
		forvalues i=2/`e(k_nnmax)' { // extract the maximum number of matches for each observation
			quietly replace max = `i' if matchid_100k_`i' !=.
			} 
		gen temp_id = round(runiform(1-0.5,max+0.5)) // randomly select a number from 1 to the maximum number of matches for each obs.
			// subtracting and adding 0.5 ensures that 1 and 'max' have equal chance of being selected as 2 through 'max-1'
		gen rmatch_id = .
		forvalues i = 1/`e(k_nnmax)' { // extract a random match from the list of possible matches
			quietly replace rmatch_id = matchid_100k_`i' if temp_id == `i'
			}
		drop matchid_100k_*
		
		preserve // save the pwsid's that match with the treatment group
		keep if T_100k == 1
		keep rmatch_id
		rename rmatch_id obs_id
		duplicates tag obs_id, gen(freq_100k)
		replace freq_100k = freq_100k+1
		duplicates drop obs_id, force
		sort obs_id
		save "$intermediate/temp_match_100k.dta", replace
		restore
		
		preserve // save the control group pwsid's that match with the treatment group 
		keep if T_100k == 0
		keep obs_id pwsidNUM
		rename (obs_id pwsidNUM) (rmatch_id pwsid_match_100k)
		duplicates report, rmatch_id
		sort rmatch_id
		save "$intermediate/temp_match2_100k.dta", replace
		restore
		
		merge 1:1 obs_id using "$intermediate/temp_match_100k.dta" // merge matched control observation id's back into panel
			drop if _merge == 1 & T_100k == 0 // drop "control" systems that were not matched to a treatment system
			drop if _merge == 2 			 // treatment systems that are not matched show up as "." in the temp file
			rename _merge merge1
		
		merge m:1 rmatch_id using "$intermediate/temp_match2_100k.dta" // attach the control pwsid to its treated match
			drop if _merge == 2 // drop the "control" systems that were not matched to a treatment (note: = to first drop after first merge above)
			rename _merge merge2
		
		tab merge1 merge2
			// if obs is master only in merge1, it is a treated obs that SHOULD be matched to its control pwsid in merge2
			// if obs is matched in merge1, its a control obs that should NOT be matched in merge2
		
		tab T_100k rmatch_id if merge1==merge2, m // these treated systems do not have a matched control system
			list pwsidNUM epa_reg public purchaser surface t0_v_avg rmatch_id T_100k if merge1==merge2
			drop if T_100k == 1 & rmatch_id == . // these observations do not have a matched control system

		*sort and save final dataset of pwsid's and matched pwsid's (along with frequency of control pwsid's)
			tab freq T_100k, m
				// frequency is missing only for treated units
				replace freq_100k = 1 if freq_100k==.
			keep pwsidNUM pwsid_match_100k freq_100k
			sort pwsidNUM
			la data "Online: matched treated obs with associated matched controls (with cntrl freq)"
			save "$outpanel/matchid-T_100k.dta", replace

* ============================================================================== 
* Step 3: Merge matches for each threshold back into water systems panel
* ==============================================================================
			
// Merge matched IDs back into main dataset and recreate summary stats table
use "$outpanel/natlCWS_yr_Panel_mod-small_v2.dta", clear

	merge m:1 pwsidNUM using "$outpanel/matchid-T_501.dta"
		gen mflag501 = 0
		replace mflag501 = 1 if _merge == 3
		drop _merge
		
	sort pwsidNUM year
	merge m:1 pwsidNUM using "$outpanel/matchid-T_10k.dta"
		gen mflag10k = 0
		replace mflag10k = 1 if _merge == 3
		drop _merge

	sort pwsidNUM year
	merge m:1 pwsidNUM using "$outpanel/matchid-T_100k.dta"
		gen mflag100k = 0
		replace mflag100k = 1 if _merge == 3
		drop _merge

	sort pwsidNUM year
	compress
	la da "National CWS annual panel (small) for matching"
	save "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match.dta", replace
	
* ============================================================================== 
* Step 4: Table of system characteristics without matching
* ==============================================================================
set more off
local rows epa_reg surface public purchaser wholesale school t0_v_avg // west vpc_avg 
local rowmax = 8
mat T = J(`rowmax',11,.)

* populate matrix
	* Full sample - - - - - - - - - - - - - - - - - - - - - - - - - - - -
	use "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match.dta", clear
	keep if id2001==1 & year == 1990
	local i = 0
	foreach v of varlist `rows' {
		local i = `i' + 1
		quietly summarize `v'
		mat T[`i',1] = r(mean)
		mat T[`rowmax',1] = r(N)
		}
	* T_501 - - - - - - - - - - - - - - - - - - - - - - - - - - - -
	use "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match.dta", clear
	keep if id2001==1 & year == 1990 & mflag501 == 1
	expand freq_501
	sort pwsidNUM
	local i = 0
	foreach v of varlist `rows' {
		local i = `i' + 1
		quietly ttest `v' if mflag501 == 1, by(T_501)
		display "`v' (publishing threshold): p value of difference `r(p)'"
		mat T[`i',2] = r(mu_1)
		mat T[`i',3] = r(mu_2)
		mat T[`i',4] = r(p)
		mat T[`rowmax',2] = r(N_1)
		mat T[`rowmax',3] = r(N_2)
		}
	// T_10k - - - - - - - - - - - - - - - - - - - - - - - - - - - -
	use "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match.dta", clear
	keep if id2001==1 & year == 1990 & mflag10k == 1
	expand freq_10k
	sort pwsidNUM
	local i = 0
	foreach v of varlist `rows' {
		local i = `i' + 1
		quietly ttest `v' if mflag10k == 1, by(T_10k)
		display "`v' (mailing threshold): p value of difference `r(p)'"
		mat T[`i',5] = r(mu_1)
		mat T[`i',6] = r(mu_2)
		mat T[`i',7] = r(p)
		mat T[`rowmax',5] = r(N_1)
		mat T[`rowmax',6] = r(N_2)
		}
	// T_100k - - - - - - - - - - - - - - - - - - - - - - - - - - - -
	use "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match.dta", clear
	keep if id2001==1 & year == 1990 & mflag100k == 1
	expand freq_100k
	sort pwsidNUM
	local i = 0
	foreach v of varlist `rows' {
		local i = `i' + 1
		quietly ttest `v' if mflag100k == 1, by(T_100k)
		display "`v' (online threshold): p value of difference `r(p)'"
		mat T[`i',8] = r(mu_1)
		mat T[`i',9] = r(mu_2)
		mat T[`i',10] = r(p)
		mat T[`rowmax',8] = r(N_1)
		mat T[`rowmax',9] = r(N_2)
	}
	* output matrix to table
	local rnames "EPA region" "Surface Water" "Public" "Purchaser" "Wholesaler" "School or Daycare" "Mean Pre-Treat Viol" "Observations" // "West" "Mean Viol/1000"
	local tbloption1 "tex fragment replace coljust(rl) hlines(101{0}11) spacebef(111{0}1)"
	local tbloption2 "sdec(2\2\2\2\2\2\3\0) vlines(00000100100) multicol(1,3,3;1,6,3;1,9,3) " 
	mat rownames T = "`rnames'"
	frmttable using "$output/utilityCharacteristics-match_v2.tex", statmat(T) `tbloption1' `tbloption2' ///
	ctitle("{\bf id2001}","","\emph{Publishing}","","","\emph{Mailing}","","","\emph{Online}","","" \ ///
	"","Full","\(T=0\)","\(T=1\)","p-val","\(T=0\)","\(T=1\)","p-val","\(T=0\)","\(T=1\)","p-val")

* ============================================================================== 
* Step 5: Clear temporary datasets
* ==============================================================================
erase "$intermediate/temp-prematching.dta"
erase "$intermediate/temp_match_501.dta"
erase "$intermediate/temp_match2_501.dta"
erase "$intermediate/temp_match_10k.dta"
erase "$intermediate/temp_match2_10k.dta"
erase "$intermediate/temp_match_100k.dta"
erase "$intermediate/temp_match2_100k.dta"

* ========================== DONE DONE DONE ==========================
log close matchpanel


